library(CARBayesST)  # ST.CARlinear: spatio-temporal CAR models
library(plotly)      # interactive choropleths and scatter plots
library(tidyverse)   # readr/dplyr/tidyr/stringr wrangling + ggplot2

# Directory containing all input data files read below.
DATA_PATH <- file.path('data')
# Temporal split by calendar year: fit on 2006-2017, tune on 2018,
# hold out 2019 for final testing.
TRAINING_YEARS <- 2006:2017
VALIDATION_YEAR <- 2018
TEST_YEAR <- 2019
# State-level FIPS prefixes for US territories (AS, GU, MP, PR, VI),
# which are excluded from the adjacency structure.
TERRITORY_FIPS <- c('60', '66', '69', '72', '78')
The main data set contains drug overdose death rates by county.
# NCHS model-based drug overdose death rates, one row per county-year.
overdose <- read_csv(
  file.path(DATA_PATH,
            'NCHS_-_Drug_Poisoning_Mortality_by_County__United_States.csv')
) %>%
  transmute(
    year = Year,
    # Zero-pad to the canonical 5-digit county FIPS so later joins line up.
    fips = str_pad(FIPS, width = 5, side = 'left', pad = '0'),
    county = County,
    # Smoothed (model-based) rate, not the raw count-derived rate.
    death_rate = `Model-based Death Rate`
  )
The National Bureau of Economic Research provides a
csv containing an edge-list representation of counties sharing a
border. We filter to exclude FIPS codes demarcating US territories but
include the District of Columbia. The data are transformed to an
adjacency matrix county_adj_matrix, which we will use to
control for spatial autocorrelation in the model below.
# Edge list of counties sharing a border (NBER 2010 adjacency file).
county_adj <- read_csv(
  file.path(DATA_PATH, 'county_adjacency2010.csv'),
  col_select = c(fipscounty, fipsneighbor)
) %>%
  filter(
    fipsneighbor != fipscounty,                     # drop self-adjacency rows
    !str_sub(fipscounty, 1, 2) %in% TERRITORY_FIPS  # drop territory counties
  ) %>%
  arrange(fipscounty, fipsneighbor)

# All county FIPS codes appearing as a source node; these index both
# dimensions of the adjacency matrix.
fips <- unique(county_adj$fipscounty)
# Square 0/1 matrix with FIPS-labelled rows/cols; entry (i, j) = 1L when
# county i borders county j. Used as W in the CAR prior below.
county_adj_matrix <- matrix(integer(length(fips)^2), nrow = length(fips),
                            dimnames = list(fips, fips))
# NOTE(review): fipsneighbor is not filtered for territories; indexing by
# dimnames would error if a neighbor were absent from `fips`. This works in
# practice because no mainland county borders a territory — confirm upstream.
county_adj_matrix[as.matrix(county_adj)] <- 1L
The industry concentration data are derived from the County Business Patterns Survey published by the US Census Bureau. In particular, we are looking at the so-called location quotient, which is the ratio of an industry's share of employment within a county to its national share. This statistic gives a measure of specialization within a county. Work with this feature is exploratory and is subject to change. In particular, we suspect that working directly with the more interpretable share of employment within a county might give similar results. This sensitivity analysis will come in future work.
# Industry cluster columns (location quotients) carried into the model.
industry_clusters <- c('Coal.Mining_Cluster.LQ')

# Flag values at or above the p-th quantile of the strictly positive
# entries of x. Returns a 0/1 numeric vector; NA inputs stay NA.
has_high_lq <- function(x, p = 0.75) {
  cutoff <- quantile(x[x > 0], probs = p, na.rm = TRUE)
  as.numeric(x >= cutoff)
}
# County Business Patterns location quotients; keep the selected industry
# clusters and flag counties in the top quartile of positive LQ values.
industry <- read_csv(file.path(DATA_PATH, 'reduced_ind_conc_data.csv')) %>%
  # Fix: pad must be the single character '0' — a numeric 0 errors in
  # stringr — and this matches the FIPS padding used for the other sources.
  mutate(fips = str_pad(region, width = 5, side = 'left', pad = '0')) %>%
  select(year, fips, all_of(industry_clusters)) %>%
  mutate(across(all_of(industry_clusters), has_high_lq, .names = "high_{.col}"))
The data come from the Census SAIPE Model, which regresses the number of people in poverty on the number of tax returns with gross incomes falling below the official poverty threshold, the number of SNAP benefits in July of the previous year, the estimated resident population as of July 1, the total number of tax return exemptions, and the Census 2000 estimate of the number of people in poverty (within categories).
# Census SAIPE county-level poverty and income estimates.
poverty <- arrow::read_parquet(file.path(DATA_PATH, 'Poverty_Data.parquet')) %>%
  transmute(
    year = as.numeric(Year),
    # Zero-pad to the 5-digit county FIPS used by the other sources.
    fips = str_pad(`County FIPS Code`, width = 5, side = 'left', pad = '0'),
    # NOTE(review): this `state` column is overwritten downstream by
    # str_sub(fips, 1, 2) when assembling df.
    state = `State FIPS`,
    pov_percent_all = `Poverty Percent, All Ages`,
    median_income = `Median Household Income`,
    pov_percent_minors = `Poverty Percent, Age 5-17 in Families`
  )
# Assemble the modeling frame: a complete county-by-year grid (training +
# validation years) joined against all covariate sources, with the
# validation-year response masked so the model treats it as missing.
df <- expand_grid(year = c(TRAINING_YEARS, VALIDATION_YEAR), fips) %>%
  left_join(overdose, by = c('year', 'fips')) %>%
  left_join(poverty, by = c('year', 'fips')) %>%
  left_join(industry, by = c('year', 'fips')) %>%
  mutate(
    state = str_sub(fips, 1, 2),
    # NA in the validation year (no matching case_when clause), so
    # ST.CARlinear predicts those cells rather than fitting to them.
    masked_death_rate = case_when(year != VALIDATION_YEAR ~ death_rate),
    # Impute missing industry indicators as 0
    high_Coal.Mining_Cluster.LQ = replace_na(high_Coal.Mining_Cluster.LQ, 0)
  ) %>%
  group_by(year, state) %>%
  mutate(
    # Impute missing values with the yearly state-level center of the SAME
    # variable. (Fixes copy-paste bugs: minors' poverty was imputed with the
    # all-ages mean, and median income with the median of the all-ages
    # poverty PERCENT — a value on an entirely different scale.)
    pov_percent_all = replace_na(pov_percent_all,
                                 mean(pov_percent_all, na.rm = TRUE)),
    pov_percent_minors = replace_na(pov_percent_minors,
                                    mean(pov_percent_minors, na.rm = TRUE)),
    median_income = replace_na(median_income,
                               median(median_income, na.rm = TRUE))
  ) %>%
  ungroup() %>%
  arrange(year, fips)
head(df)
## # A tibble: 6 x 11
## year fips county death_rate state pov_percent_all median_income
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2006 01001 Autauga County, AL 6.60 01 12.5 46491
## 2 2006 01003 Baldwin County, AL 12.1 01 11 45493
## 3 2006 01005 Barbour County, AL 3.31 01 28.7 28692
## 4 2006 01007 Bibb County, AL 14.3 01 18.8 36794
## 5 2006 01009 Blount County, AL 10.6 01 12.8 41494
## 6 2006 01011 Bullock County, AL 4.76 01 32.9 23095
## # ... with 4 more variables: pov_percent_minors <dbl>,
## # Coal.Mining_Cluster.LQ <dbl>, high_Coal.Mining_Cluster.LQ <dbl>,
## # masked_death_rate <dbl>
We fit the generalized linear mixed model to the data:
\[\begin{align*} Y_{kt} &\sim f(y_{kt} \vert \mu_{kt}, \nu^2)\\ g(\mu_{kt}) &= \mathbf{x}'_{kt} \mathbf{\beta} + \psi_{kt}\\ \mathbf{\beta} &\sim N(\mathbf{\mu}_{\beta}, \mathbf{\Sigma}_{\beta})\\ \psi_{kt} &= \beta_1 + \phi_k + (\alpha + \delta_k) \frac{t - \bar{t}}{N}\\ \phi_k \vert \mathbf{\phi}_{-k}, \mathbf{W} &\sim N \left( \frac{\rho_{int}\sum_{j=1}^K w_{kj} \phi_j}{\rho_{int}\sum_{j=1}^K w_{kj} + 1 - \rho_{int}}, \frac{\tau_{int}^2}{\rho_{int}\sum_{j=1}^K w_{kj} + 1 - \rho_{int}} \right) \\ \delta_k \vert \mathbf{\delta}_{-k}, \mathbf{W} &\sim N \left( \frac{\rho_{slo}\sum_{j=1}^K w_{kj} \delta_j}{\rho_{slo}\sum_{j=1}^K w_{kj} + 1 - \rho_{slo}}, \frac{\tau_{slo}^2}{\rho_{slo}\sum_{j=1}^K w_{kj} + 1 - \rho_{slo}} \right) \\ \tau_{int}^2, \tau_{slo}^2 &\sim \text{Inverse-Gamma}(a,b) \\ \rho_{int}, \rho_{slo} &\sim \text{Uniform}(0, 1) \\ \alpha &\sim N(\mu_{\alpha}, \sigma^2_{\alpha}) \end{align*}\]
where
This model is meant to provide us with a benchmark of run-time and direction on building a diagnostics framework.
# Gaussian spatio-temporal linear-trend model: county-level CAR random
# intercepts and slopes over the border-adjacency matrix W. The NA cells of
# masked_death_rate (validation year) are imputed/predicted by the sampler.
poverty_coal_model <- ST.CARlinear(
  masked_death_rate ~
    median_income +
    pov_percent_all +
    pov_percent_minors +
    high_Coal.Mining_Cluster.LQ,
  family = "gaussian", data = df, W = county_adj_matrix,
  # 10,000 MCMC draws, first 5,000 discarded, keep every 2nd -> 2,500 samples.
  burnin = 5000, n.sample = 10000, thin = 2
)
# Attach posterior-mean fitted values and the resulting residuals to the
# modeling frame, then print the posterior summary table.
df <- mutate(
  df,
  fitted = fitted(poverty_coal_model),
  residuals = death_rate - fitted
)
poverty_coal_model$summary.results
As a sanity check, we see that our training residuals, averaged over time within counties, hover around 0.
# FIPS-keyed county boundary GeoJSON used by the choropleth traces below.
counties <- rjson::fromJSON(
  file = paste0('https://raw.githubusercontent.com/',
                'plotly/datasets/master/geojson-counties-fips.json'))
# Choropleth of each county's training residual averaged over time.
plot_ly() %>%
  add_trace(
    data = df %>%
      filter(year %in% TRAINING_YEARS) %>%
      group_by(fips) %>%
      summarise(residuals = mean(residuals, na.rm = TRUE)),
    type = "choropleth",
    geojson = counties,
    locations = ~fips,
    z = ~residuals,
    # Clamp the color scale to +/-4 so a few extreme counties don't wash
    # out the rest of the map.
    zmin = -4,
    zmax = 4,
    colors = "PRGn",
    marker = list(
      line = list(width = 0)
    ),
    colorbar = list(
      title = "Residual",
      tickmode = "array",
      tickvals = -4:4,
      ticktext = c("<=-4", '-3', '-2', '-1', '0', '1', '2', '3', '>=4')
    )
  ) %>%
  layout(
    title = "Average (over time) Training Error",
    geo = list(
      scope = 'usa',
      # Fix: plotly's US-composite projection is 'albers usa'; the original
      # misspelling 'alvers usa' silently fell back to the default.
      projection = list(type = 'albers usa')
    )
  )
Our prediction errors, on the other hand, still show some spatial clustering, with clumps of overprediction and underprediction.
# Choropleth of validation-year residuals by county.
plot_ly() %>%
  add_trace(
    data = df %>%
      filter(year == VALIDATION_YEAR),
    type = "choropleth",
    geojson = counties,
    locations = ~fips,
    z = ~residuals,
    # Same clamped +/-4 color scale as the training map for comparability.
    zmin = -4,
    zmax = 4,
    colors = "PRGn",
    marker = list(
      line = list(width = 0)
    ),
    colorbar = list(
      title = "Residual",
      tickmode = "array",
      tickvals = -4:4,
      ticktext = c("Underprediction", '-3', '-2', '-1', '0', '1', '2', '3', 'Overprediction')
    )
  ) %>%
  layout(
    title = "Validation Errors",
    geo = list(
      scope = 'usa',
      # Fix: plotly's US-composite projection is 'albers usa'; the original
      # misspelling 'alvers usa' silently fell back to the default.
      projection = list(type = 'albers usa')
    )
  )
The residuals show signs of heteroskedasticity, with higher values exhibiting higher errors. Due to overplotting, we will have to investigate this issue in several ways.
# Residuals against observed death rates: training panel on top,
# validation panel below, sharing the x-axis.
p1 <- df %>%
  filter(year %in% TRAINING_YEARS) %>%
  plot_ly() %>%
  add_trace(
    name = "Training",
    type = "scatter",
    mode = "markers",
    x = ~death_rate,
    y = ~residuals,
    # Hover label identifies the county-year behind each point.
    text = ~str_glue("{fips}: {year}")
  ) %>%
  layout(
    xaxis = list(title = "Death Rate per 100k"),
    yaxis = list(title = "Residual")
  )
p2 <- df %>%
  filter(year == VALIDATION_YEAR) %>%
  plot_ly() %>%
  add_trace(
    name = "Validation",
    type = "scatter",
    mode = "markers",
    x = ~death_rate,
    y = ~residuals,
    text = ~fips
  ) %>%
  layout(
    xaxis = list(title = "Death Rate per 100k"),
    yaxis = list(title = "Residual")
  )
subplot(p1, p2, nrows = 2, shareX = TRUE) %>%
  layout(title = "Residuals vs. Actual Values")
We estimate the conditional percentiles of the predictions given the observations. This shows that overall our model tends to underpredict death rates, which would be undesirable from a public health standpoint since it could lead to undersupplying counties that need help.
#' Conditional quantiles of predictions within fixed-width bins of the data
#'
#' @description Calculates conditional percentiles by binning the data into
#'   fixed-width strips using the x-values and summarising the y-values
#'   falling in each strip.
#' @param x observations
#' @param y predictions
#' @param n_bins number of equal-width bins spanning the joint range of
#'   `x` and `y` (fixes the doc tags: the original documented a
#'   nonexistent `threshold` argument and left `n_bins` undocumented)
#' @return A tibble with one row per (bin, percentile) pair: the bin
#'   midpoint `x_bins`, the bin count `length`, the percentile value
#'   `quantiles`, and the percentile level `probs` (ordered factor).
calculate_conditional_quantiles <- function(x, y, n_bins = 10) {
  # Percentile levels reported for every bin.
  q_levels <- c(0.1, 0.25, 0.5, 0.75, 0.9)
  # Bin edges span the joint range of x and y so both axes share a scale.
  breaks <- seq(floor(min(x, y)), ceiling(max(x, y)), length = n_bins + 1)
  # Label each bin by its midpoint so results plot on the original scale.
  labs <- breaks[-length(breaks)] + 0.5 * diff(breaks)
  x_bins <- cut(x, breaks = breaks, include.lowest = TRUE, labels = labs)
  out <- tibble(x_bins = as.numeric(as.character(x_bins)), y) %>%
    group_by(x_bins) %>%
    # Multi-row summarise: one row per percentile within each bin.
    summarise(
      length = length(y),
      quantiles = quantile(y, q_levels),
      probs = factor(q_levels, ordered = TRUE)
    )
  return(out)
}
# Conditional distributions
# Conditional distributions of fitted vs. observed rates, faceted into
# training and validation panels.
p1 <- ggplot() +
  # Raw county-year points, heavily alpha-blended to tame overplotting.
  geom_point(
    aes(death_rate, fitted),
    color = "grey20", alpha = 0.05,
    data = df %>%
      mutate(
        subset = ifelse(year %in% TRAINING_YEARS, "Training", "Validation")
      )
  ) +
  # Identity reference line y = x (fixes intercept = 1, which offset the
  # reference by one unit): points above it are overpredictions.
  geom_abline(slope = 1, intercept = 0, size = 1.1) +
  geom_line(
    aes(x_bins, quantiles, color = probs), size = 1.25,
    data = df %>%
      filter(!is.na(death_rate)) %>%
      group_by(
        subset = ifelse(year %in% TRAINING_YEARS, "Training", "Validation")
      ) %>%
      summarise(
        calculate_conditional_quantiles(death_rate, fitted, n_bins = 20)
      )
  ) +
  coord_fixed() +
  facet_grid(cols = vars(subset)) +
  scale_x_continuous(limits = c(0, 150)) +
  scale_y_continuous(limits = c(0, 100)) +
  # Symmetric palette: the 10/90 and 25/75 percentile pairs share colors.
  scale_color_manual(
    values = c("#e6194B", "#42d4f4", "#911eb4", "#42d4f4", "#e6194B")
  ) +
  labs(
    title = "Conditional Distribution of Fitted Values",
    x = "Observed Values",
    y = "Fitted Values",
    color = "Percentile"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 24),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 16),
    legend.title = element_text(size = 20),
    legend.text = element_text(size = 16),
    strip.text = element_text(size = 20),
    legend.position = "bottom"
  )
ggplotly(p1)
Since the percentile estimates for larger death rates are unstable due to the small number of observations that exhibit such extreme rates, we’ll take a moment to examine our model’s performance on the more “tame” cases. We see that there are years in the middle that our model does fairly well on, but in the tail years, we underpredict for an alarming number of counties (2016-2018).
# Same conditional-distribution view, faceted by year and zoomed into the
# "tame" range of death rates (0-40) where percentile estimates are stable.
ggplot() +
  geom_point(
    aes(death_rate, fitted),
    color = "grey20", alpha = 0.05,
    data = df
  ) +
  # Identity reference line y = x (fixes intercept = 1, which offset the
  # reference by one unit): points above it are overpredictions.
  geom_abline(slope = 1, intercept = 0, size = 1.1) +
  geom_line(
    aes(x_bins, quantiles, color = probs), size = 1.25,
    data = df %>%
      filter(!is.na(death_rate)) %>%
      group_by(
        year
      ) %>%
      summarise(
        calculate_conditional_quantiles(death_rate, fitted, n_bins = 20)
      )
  ) +
  coord_fixed() +
  facet_wrap(vars(year)) +
  scale_x_continuous(limits = c(0, 40)) +
  scale_y_continuous(limits = c(0, 40)) +
  # Symmetric palette: the 10/90 and 25/75 percentile pairs share colors.
  scale_color_manual(
    values = c("#e6194B", "#42d4f4", "#911eb4", "#42d4f4", "#e6194B")
  ) +
  labs(
    title = "Conditional Distribution of Fitted Values",
    x = "Observed Values",
    y = "Fitted Values",
    color = "Percentile"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 24),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 16),
    legend.title = element_text(size = 20),
    legend.text = element_text(size = 16),
    strip.text = element_text(size = 20),
    legend.position = "bottom"
  )